First, we need to read in the data and join the training and test data.
# Load the Dreaddit training split; readr prints the column spec below.
train_data <- read_csv("reddit_stress_data/dreaddit-train.csv")
## Rows: 2838 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the Dreaddit test split (same 116-column schema as the train split).
test_data <- read_csv("reddit_stress_data/dreaddit-test.csv")
## Rows: 715 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack the train and test splits into one data frame for exploration.
# Fix: bind_rows() is the function for combining two data frames with
# matching columns; add_row(.data, ...) expects individual name-value
# pairs and is the wrong tool for appending a whole data frame.
reddit_stress_data <- bind_rows(train_data, test_data)
Now we need to find the word distributions. We’ll start by unnesting the tokens and training this on the full dataset.
# One row per word occurrence over the combined dataset: tokenize the post
# text, strip punctuation, replace bare numbers with a "%d%" placeholder
# token, and drop common English stop words.
words_tokenized <- reddit_stress_data %>%
  select(id, text, label, subreddit) %>%
  unnest_tokens(word, text) %>%
  mutate(
    word = gsub('[[:punct:]]+', '', word),
    word = gsub('\\<[[:digit:]]+\\>', '%d%', word)
  ) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Same tokenization pipeline applied to the test split only.
words_tokenized_test <- test_data %>%
  select(id, text, label, subreddit) %>%
  unnest_tokens(word, text) %>%
  mutate(
    word = gsub('[[:punct:]]+', '', word),
    word = gsub('\\<[[:digit:]]+\\>', '%d%', word)
  ) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Same tokenization pipeline applied to the training split only.
words_tokenized_train <- train_data %>%
  select(id, text, label, subreddit) %>%
  unnest_tokens(word, text) %>%
  mutate(
    word = gsub('[[:punct:]]+', '', word),
    word = gsub('\\<[[:digit:]]+\\>', '%d%', word)
  ) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Posts per stress label (0/1), shown as a bar chart to inspect the
# class balance.
label_counts <- reddit_stress_data %>%
  group_by(label) %>%
  count()
label_counts %>%
  plot_ly(x = ~label, y = ~n, type = "bar")
# Posts per subreddit, shown as a bar chart.
subreddit_counts <- reddit_stress_data %>%
  group_by(subreddit) %>%
  count()
# Fix: plotly's trace argument is `type`, not `kind`. The original
# `kind = "bar"` was silently ignored (see the warning in the rendered
# output) and the trace type was merely inferred.
plot_ly(subreddit_counts, x = ~subreddit, y = ~n, type = "bar")
## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#bar
## Warning: 'bar' objects don't have these attributes: 'kind'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'xperiod', 'yperiod', 'xperiod0', 'yperiod0', 'xperiodalignment', 'yperiodalignment', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'textposition', 'insidetextanchor', 'textangle', 'textfont', 'insidetextfont', 'outsidetextfont', 'constraintext', 'cliponaxis', 'orientation', 'base', 'offset', 'width', 'marker', 'offsetgroup', 'alignmentgroup', 'selected', 'unselected', 'r', 't', '_deprecated', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
# Boxplot of the label value, faceted by label and subreddit.
# Fix: `height` and `width` are not ggplot() arguments and were silently
# ignored, so they are removed here (plot dimensions belong to the chunk
# options or the graphics device, not to ggplot()).
ggplot(reddit_stress_data, aes(y = label)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Labels by Subreddit") +
  facet_grid(label ~ subreddit)
First, to get a better idea of what the overall word distribution is, let’s plot all of the word frequencies.
# Per-word frequencies over the full corpus, most frequent first.
my_top_word_counts <- words_tokenized %>%
  count(word) %>%
  arrange(-n)
# Boxplot of the frequency distribution.
my_top_word_counts %>%
  ggplot(aes(y = n)) +
  geom_boxplot(fill = "steelblue")
Although some words appear very frequently, most of the words barely appear compared to the top words. Let’s see what happens if we eliminate words based on whether they appear less than or equal to 5 times.
# Apply rare-word replacement with a cutoff of 5 occurrences, then
# re-examine the frequency distribution. (ReplaceRareWords is defined
# elsewhere in this project.)
words_tokenized_rare_words_removed <- reddit_stress_data %>%
  ReplaceRareWords(rare_defn = 5)
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  ggplot(aes(y = n)) +
  geom_boxplot(fill = "steelblue")
There seem to still be significant outliers in the dataset. I’m going to look at the one significant outlier and see what it is.
# Inspect the five most frequent tokens after rare-word replacement;
# "unk" (the rare-word placeholder) dominates.
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5)
## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 unk 14453
## 2 text_im 2235
## 3 text_%d% 2210
## 4 text_dont 1184
## 5 text_feel 967
Defining the cut off for rare words removal to be 5 reduces the number of words somewhat but the data is still heavily right skewed. Let’s try setting the cut off to 15.
# Raise the rare-word cutoff to 15 occurrences and re-plot the
# frequency distribution.
words_tokenized_rare_words_removed <- reddit_stress_data %>%
  ReplaceRareWords(rare_defn = 15)
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  ggplot(aes(y = n)) +
  geom_boxplot(fill = "steelblue")
We have removed a significant number of words but the data is still heavily right skewed. It looks like there is one significant outlier that is skewing the data.
# The top token is again "unk" — now far larger, since the higher cutoff
# folds many more words into the placeholder.
words_tokenized_rare_words_removed %>%
  count(word) %>%
  arrange(desc(n)) %>%
  slice_head(n = 5)
## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 unk 29766
## 2 text_im 2235
## 3 text_%d% 2210
## 4 text_dont 1184
## 5 text_feel 967
# Vocabulary size: one row per distinct word.
nrow(my_top_word_counts)
## [1] 12059
Since the number of words is over 12,000, it is difficult to see patterns in the data. Further, it may be helpful to compute summary statistics describing the distribution of word frequencies in the dataset.
# Summary statistics of the per-word frequencies. The mean (~8) is far
# above the median (2) and the SD dwarfs the IQR — both confirm a heavy
# right skew in word frequency.
mean(my_top_word_counts$n)
## [1] 8.049092
sd(my_top_word_counts$n)
## [1] 41.04796
IQR(my_top_word_counts$n)
## [1] 4
median(my_top_word_counts$n)
## [1] 2
Now let’s see the most common words among the data (overall).
# Return the `num` most frequent words in `df`, a tokenized data frame
# with one row per word occurrence and a `word` column.
GetTopNMostCommonWords <- function(df, num) {
  word_freqs <- df %>%
    count(word) %>%
    arrange(desc(n))
  head(word_freqs, num)
}
# Number of top words to display in the plots that follow.
num <- 20
# NOTE(review): despite the `top_10_` prefix, this holds the top `num`
# (20) words; the name is kept because later plotting code refers to it.
top_10_full_data <- GetTopNMostCommonWords(words_tokenized, num)
Now I will plot the top 20 most common words in the dataset.
# Bar chart of the `num` (20) most common words, ordered by frequency.
# Fix: the title said "Top 10" but 20 words are plotted.
ggplot(top_10_full_data, aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Full Dataset", x = "Word", y = "Frequency")
Now let’s see how this varies among label: stressed or non-stressed.
# Split tokens by stress label. In the Dreaddit dataset label 1 marks a
# stressed post and label 0 a non-stressed post (Turcan & McKeown, 2019);
# the original assignments were swapped.
stressed_data <- filter(words_tokenized, label == 1)
non_stressed_data <- filter(words_tokenized, label == 0)
Now let’s plot them
# Fix: the title said "Top 10" but `num` (20) words are plotted.
ggplot(GetTopNMostCommonWords(non_stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Non-Stressed Dataset", x = "Word", y = "Frequency")
Now let’s see the difference among stressed data.
# Fix: the title said "Top 10" but `num` (20) words are plotted.
ggplot(GetTopNMostCommonWords(stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Stressed Dataset", x = "Word", y = "Frequency")
# Exploring Differences in Subreddit Data Now we’re going to examine the differences by subreddit. First, we will see what unique subreddits have been selected. For each subreddit, I want to examine the difference between the labels and the different words among each label.
# List the distinct subreddits present in the combined dataset.
unique(reddit_stress_data$subreddit)
## [1] "ptsd" "assistance" "relationships" "survivorsofabuse"
## [5] "domesticviolence" "anxiety" "homeless" "stress"
## [9] "almosthomeless" "food_pantry"
I’m interested in understanding how the data is distributed among each of these subreddits.
# Number of posts per subreddit.
reddit_stress_data %>%
  ggplot(aes(x = subreddit)) +
  geom_bar(fill = "steelblue")
Now, let’s check out the ptsd subreddit.
# Tokens from the ptsd subreddit only.
ptsd_data <- filter(words_tokenized, subreddit == "ptsd")
# Label balance within r/ptsd.
ggplot(filter(reddit_stress_data, subreddit == "ptsd"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "PTSD Data by Label")
# Fix: the title said "Top 10" but `num` (20) words are plotted.
ggplot(GetTopNMostCommonWords(ptsd_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the PTSD Subreddit", x = "Word", y = "Frequency")
## Domestic Violence Subreddit Now, let’s check out the domestic violence subreddit.
# Tokens from the domesticviolence subreddit only.
domestic_violence_data <- filter(words_tokenized, subreddit == "domesticviolence")
# Label balance within r/domesticviolence.
ggplot(filter(reddit_stress_data, subreddit == "domesticviolence"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Domestic Violence Data by Label")
# Fix: the title said "Top 10" but `num` (20) words are plotted.
ggplot(GetTopNMostCommonWords(domestic_violence_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Domestic Violence Subreddit", x = "Word", y = "Frequency")
## Almost Homeless Subreddit Now let’s check out the almost homeless subreddit
# Tokens from the almosthomeless subreddit only.
almost_homeless_data <- filter(words_tokenized, subreddit == "almosthomeless")
# Label balance within r/almosthomeless.
ggplot(filter(reddit_stress_data, subreddit == "almosthomeless"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Almost Homeless Data by Label")
# Fix: the title said "Top 10" but `num` (20) words are plotted.
ggplot(GetTopNMostCommonWords(almost_homeless_data, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Almost Homeless Subreddit", x = "Word", y = "Frequency")
## Assistance Subreddit
# Tokens from the assistance subreddit only.
# NOTE(review): the name breaks the *_data convention used for the other
# subreddits; kept as-is in case later code references it.
assistance_subreddit <- filter(words_tokenized, subreddit == "assistance")
# Label balance within r/assistance.
ggplot(filter(reddit_stress_data, subreddit == "assistance"), aes(x = label)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Assistance Data by Label")
# Top `num` (20) words within r/assistance.
ggplot(GetTopNMostCommonWords(assistance_subreddit, num), aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = "Top 20 Words from the Assistance Subreddit", x = "Word", y = "Frequency")
# Overall distribution of the precomputed `sentiment` feature.
reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment")
## By Label
# Sentiment distribution split by stress label.
reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ label)
## By Subreddit
# Sentiment distribution split by subreddit.
reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ subreddit)
# Sentiment distribution in a subreddit-by-label grid.
reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_histogram(fill = "steelblue", bins = 50) +
  labs(title = "Distribution of Sentiment") +
  facet_grid(subreddit ~ label)